"""
@Filename       : analysis_word.py
@Create Time    : 2021/4/24 16:04
@Author         : Rylynn
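@Description    : Build the set of distinct words in the train/test/valid
                  splits of a LocationTweetDataset and print the size of each
                  set, of every pairwise intersection, and of the overall union.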

"""
from utils.dataset import LocationTweetDataset

# Dataset identifier handed to LocationTweetDataset for every split.
dataset = 'cmu'

# Build the three splits in the fixed order train, test, valid.
train_dataset, test_dataset, valid_dataset = (
    LocationTweetDataset(dataset, suffix=split)
    for split in ('train', 'test', 'valid')
)

def _collect_words(split):
    """Return the set of distinct words found in one dataset split.

    Every item yielded by ``split`` is unpacked as ``(tweets, _)``; the
    second element is ignored. ``tweets`` is iterated to obtain individual
    tweets, and each tweet is iterated to obtain its words.
    """
    words = set()
    for tweets, _ in split:
        for tweet in tweets:
            # set.update adds every element of the tweet in one call.
            words.update(tweet)
    return words


# One vocabulary per split, built by the same routine so they stay comparable.
train_word_set = _collect_words(train_dataset)
test_word_set = _collect_words(test_dataset)
valid_word_set = _collect_words(valid_dataset)


# Vocabulary size of each split.
for label, words in (
    ('train: ', train_word_set),
    ('test: ', test_word_set),
    ('valid: ', valid_word_set),
):
    print(label, len(words))

# Number of words shared by each pair of splits.
shared_train_test = train_word_set.intersection(test_word_set)
print('train and test: ', len(shared_train_test))

shared_train_valid = train_word_set.intersection(valid_word_set)
print('train and valid: ', len(shared_train_valid))

shared_test_valid = test_word_set.intersection(valid_word_set)
print('test and valid: ', len(shared_test_valid))

# Size of the combined vocabulary across all three splits (printed unlabeled).
all_words = train_word_set.union(test_word_set, valid_word_set)
print(len(all_words))

